********************************************************************************
*		May 2022 — prepare data files for analysis
********************************************************************************

clear programs
clear all
set more off

capture program drop main
program define main
	disp "Calling subprograms"
	Accounts_All
	Accounts_wide
	kwh_challenges
	clean_kwh_challenges
	kwh_eventstudy
	BalanceTPS01Accounts
end //end defining the main program


*-------------------------------------------------------------------------------
* Import consumption data from match_kwh, save list of unique id's to analyze
*-------------------------------------------------------------------------------
cd "$datafrom"
use "match_kwh", clear
cd "$GeneralModified"
keep id
duplicates drop id,force
cd "$GeneralModified"
save uniqueids,replace

*-------------------------------------------------------------------------------
* 2) Merge account and challenge data
capture program drop Accounts_All
program define Accounts_All
*-------------------------------------------------------------------------------
cd "$datafrom"
use match_accounts,clear

*merge BC Hydro's measurement of the target and total kWh for challenges.
merge m:1 challenge_id_masked start_date end_date using match_challenges_record
tab _merge
//_merge==1: accounts in match_accounts with no challenge record (should be all tps==0, in theory)
//_merge==2: challenge records for which I don't have an id -> drop these
//_merge==3: successful matches; have record of the BCH account and their challenges.

drop if _merge==2
//drop all the challenge records for which I don't have id's for
drop _merge
unique id
//40,000 unique id's with challenge records. I only have kWh data for ~30,000 of these.

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*Merge match_account data with kWh data; keep only accounts that exist in the kWh consumption 
cd "$GeneralModified"
merge m:1 id using uniqueids,keep(3) nogen
//this will keep only the unique ids with each matched to an account record

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*Address the known issue where BC matched the wrong challenge identificatiton id's to customer id's
duplicates list
duplicates tag challenge_id_masked if tps==1,gen(dup_chalidmask)
duplicates tag challenge_id_masked start_date end_date target_kwh_num total_kwh_num degree_value_num historical_total_kwh_num if tps==1,gen(dup_chalidvars)

tab dup_chalidmask dup_chalidvars
//92 are duplicate challenges
//id's of 20001, 20003 are not incldued in this, because they they ahve unique challenge id's. 

duplicates report challenge_id_masked start_date end_date target_kwh_num total_kwh_num degree_value_num historical_total_kwh_num if tps==1
duplicates drop challenge_id_masked start_date end_date target_kwh_num total_kwh_num degree_value_num historical_total_kwh_num if tps==1,force
//drops 46 surplus observations
drop dup*

*some challenges are identical but have different challenge_id_masked: these need to be removed
duplicates report id start_date end_date target_kwh_num total_kwh_num degree_value_num historical_total_kwh_num
duplicates drop id start_date end_date target_kwh_num total_kwh_num degree_value_num historical_total_kwh_num,force

*Dataset should have no duplicate observations with the same challenge start date for an id, in either id or challenge_id_masked, at this point
duplicates report id start_date

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* A number of challenges begin in 2016, after the kwh data ends in 2015. This causes the households to be listed as having say 3 challenges, even if there is a 4th begin_challenge_4 due to starting a challenge in the future
*e.g. br if inlist(id,20021,20027,20033,20043)

*clear any challenges that begin after 2015m12. only challenges beginning during the 10 year panel are considered.
*br if start_date>=date("01jan2016","DMY") & tps==1
*unique id if start_date>=date("01jan2016","DMY") & tps==1
drop if start_date>=date("01jan2016","DMY") & tps==1
*this removes challenges that begin after the panel of kWh data.

duplicates list challenge_id_masked if tps==1
duplicates list id start_date end_date
duplicates list id start_date
//no duplicates

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*Check the order of the challenges is correct.
bysort id (start_date): gen challenge_order=_n if tps==1
bysort id: egen num_challenges=max(challenge_order)

*Verify the challenge_order properly matches the challenge_number
tab challenge_number challenge_order if tps==1
br if challenge_number==1 & challenge_order==2
//challenge_number occassionaly repeats. challenge_number is provided by BC Hydro and isn't a unique identifier I can use

*check that the challenge_order always matches the date order
sort id challenge_order
tsset id start_date

gen ERROR=0
by id: replace ERROR=1 if challenge_order[_n]-challenge_order[_n-1]!=1 & _n!=1
//no changes made so challenge order fully captures the ordering of challenges that is in start_date
//This means I can use the challenge_order to capture the time series ordering of when a challenge occurs
drop ERROR

tab challenge_order,miss
tab challenge_order if tps==0,miss
//no observations & challenge_order==0 doesn't appear. This means i can set challenge_order=0 to reflect tps==0
replace challenge_order=0 if challenge_order==. & tps==0
tab challenge_order,miss

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
* BCH defines two sets of 'pass' or 'fail.' Need to use the final pass/fail status shown to households after the final weather correction is applied
*Compare challenge_status and challenge_status2, and status_cd
tab challenge_status challenge_status2,miss
tab challenge_status status_cd
tab challenge_status2 status_cd
tab challenge_status challenge_status2 if tps==1,miss

*Accounts to drop
tab challenge_status,miss

*For CANCELLED
tab id if challenge_status=="CANCELLED"
br if id==20565
//only the last challenge is cancelled, so remove that single challenge
drop if id==20565 & challenge_id_masked==2924

*For SUSPEND
br if challenge_status=="SUSPEND"
//9 challenges have suspend for a challenge
capture drop temp
gen temp=1 if challenge_status=="SUSPEND"
bysort id: egen todrop=max(temp)
unique id if todrop==1
//9 accounts
br if todrop==1

drop temp
drop todrop
drop if challenge_status=="SUSPEND"

*For ACTIVE
tab challenge_status,miss

rename challenge_status2 challenge_status_two
//can't use challenge_status2 as this conflicts with the renaming done in reshape

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
duplicates report id start_date
duplicates report id challenge_order
//no duplicate id-challenges

*Move the start_date and end_date to be the beginning and end of months.
*define the start_date_adj based on when in a month the start_date occured, then 
*define the end_date_adju as 12 months after that date to avoid defining years as 13 months

rename start_date start_date_day
rename end_date end_date_day

gen start_date=mofd(start_date_day) if day(start_date_day)<=15
replace start_date=mofd(start_date_day)+1 if day(start_date_day)>15
gen end_date=start_date+11
format %tm start_date end_date

*drop any challenges where the start date is in 2016 or after
drop if start_date>=mofd(date("01jan2016","DMY")) & tps==1

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*Re label the success indicators
gen success=.
replace success=0 if challenge_status=="FAIL"
replace success=1 if challenge_status=="PASS"
replace success=2 if challenge_status=="ACTIVE"
replace success=3 if challenge_status=="COMPLETE"
	
gen success2=.	
replace success2=0 if challenge_status_two=="3-Fail"
replace success2=1 if challenge_status_two=="1-Pass"
replace success2=2 if challenge_status_two=="2-Save"
replace success2=3 if challenge_status_two=="" & tps==1

gen status=.
replace status=0 if status_cd=="FAIL"
replace status=1 if status_cd=="PASS"
replace status=2 if status_cd=="CANCELLED"
replace status=3 if status_cd==""

*handle missing obs
replace success=4 if success==.
replace success2=4 if success2==. 

label define success 0 "Fail" 1 "Pass" 2 "Active" 3 "Complete" 4 "NoChal",replace
label define success2 0 "Fail" 1 "Pass" 2 "Save" 3 "Blank" 4 "NoChal",replace
label define status 0 "Fail" 1 "Pass" 2 "Cancelled" 3 "NoChal",replace

label val success success
label val success2 success2
label val status status

tab success if tps==1
tab success success2
tab success status

*5) Drop unneeded variables
drop challenge_status* status_cd
//all info now recorded in success, success2, and status
sort id challenge_order

cd "$GeneralModified"
save Accounts_All,replace

end

*-------------------------------------------------------------------------------
*3) Wide version of account information
capture program drop Accounts_wide
program define Accounts_wide
*-------------------------------------------------------------------------------
cd "$GeneralModified"
use Accounts_All,replace

rename start_date begin_challenge_

keep if tps==1
keep id challenge_order begin_challenge_ end_date

reshape wide begin_challenge_ end_date,i(id) j(challenge_order)
*begin_challenge_# is the same as start_date#

*Generate number of months between challenges
forvalues i=1/8 {
	local i2=`i'+1
	gen reduc_months_`i2'=begin_challenge_`i2'-end_date`i'-1
}

*define the shortgap indicators
replace reduc_months_2=0 if reduc_months_2==.
replace reduc_months_3=0 if reduc_months_3==.
replace reduc_months_4=0 if reduc_months_4==.
replace reduc_months_5=0 if reduc_months_5==.
replace reduc_months_6=0 if reduc_months_6==.
replace reduc_months_7=0 if reduc_months_7==.
replace reduc_months_8=0 if reduc_months_8==.
replace reduc_months_9=0 if reduc_months_9==.

gen shortgap=1
replace shortgap=0 if reduc_months_2>3
replace shortgap=0 if reduc_months_3>3
replace shortgap=0 if reduc_months_4>3
replace shortgap=0 if reduc_months_5>3
replace shortgap=0 if reduc_months_6>3
replace shortgap=0 if reduc_months_7>3
replace shortgap=0 if reduc_months_8>3
replace shortgap=0 if reduc_months_9>3

capture drop shortgapV2
gen shortgapV2=0
forvalues chalnum=2/9 {
	*Verify that all reduc_months are < the level.
	replace shortgapV2=reduc_months_`chalnum' if reduc_months_`chalnum'>shortgapV2
}
*shorgapV2 will be the maximum gap across challenge numbers

cd "$GeneralModified"
save Accounts_wide,replace

end

*-------------------------------------------------------------------------------
*Merge in the challenge_order info to the kwh file
capture program drop kwh_challenges
program define kwh_challenges
*-------------------------------------------------------------------------------
cd "$GeneralModified"
use Accounts_All,clear

duplicates list challenge_id_masked challenge_order if tps==1
duplicates tag challenge_id_masked challenge_order,gen(dupID)
tab dupID
tab dupID if tps==1
*There should be no duplicate id's for TPS==1.
drop dupID
duplicates report id start_date
//no duplicates

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
keep if tps==1
keep id challenge_order start_date end_date

*Reshape so that each challenge_order and start_date and end_date is a separate column based on the challeng_order id
*this will allow iterating through match_kwh and generating an indicator for whether a household is in a challenge (and which challenge) at that date

rename start_date begin_challenge_

reshape wide begin_challenge_ end_date,i(id) j(challenge_order)

cd "$datafrom"
merge 1:m id using match_kwh
tab _merge if tps==0
br if _merge==2 & tps==1

unique id if _merge==2 & tps==1
//these are households in the match_kwh data, but not in the Accounts_All file.
//most of these are challenges and households with active or ended challenges

drop if _merge==2 & tps==1
unique id if tps==1

drop _merge
sort id date
*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*Generate the reduc indicator for being within a challenge
gen reduc=0
forvalues x=1/9 {
	replace reduc=`x' if date >= begin_challenge_`x' & date <= end_date`x'
}

*check number of obs by id-reduc
bysort id reduc: gen temp=_N if reduc!=0
tab date if temp!=12 & temp!=0
br if date==monthly("Feb-11", "MY", 2050) & temp!=. & temp!=12

br if id==21286
//has a single reduc 2 challenge that lists a single month. This is because the next challenge starts 1 month after the first
drop if id==21286
//remaining challenges without 12 months are those that occur at the end of the sample period so I don' thave 12 months of kwh data 

keep id tps date kwh reduc

cd "$GeneralModified"
save kwh_challenges,replace

end

*-------------------------------------------------------------------------------
*Clean data - primarily outliers and adjust months, duplicates were removed previously
capture program drop clean_kwh_challenges
program define clean_kwh_challenges
*-------------------------------------------------------------------------------
cd "$GeneralModified"
use kwh_challenges.dta, clear

*** Make a weighted kwh that would be the consumption if each month had 30 days. This counts for various days in a month.
*Method: IF (date==in month with daysinmonth days) THEN generate 30 day average KWH:
*kwhw = kwh/daysinmonth*30
gen month=month(dofm(date))
gen year=year(dofm(date))

gen kwhw=.
replace kwhw=kwh*(30/31) if inlist(month,1,3,5,7,8,10,12)
//31 day months
replace kwhw=kwh*(30/30) if inlist(month,4,6,9,11)
//30 day months
replace kwhw=kwh*(30/28) if inlist(month,2) & inlist(year,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016,2020)!=1
replace kwhw=kwh*(30/29) if inlist(month,2) & inlist(year,1980,1984,1988,1992,1996,2000,2004,2008,2012,2016,2020)
//February
*br if kwhw==.
//Should find no missing months.
*br
*Dataset should be cleaned and ready for estimations
//keep kwh to see if this explains the problem I have in predicting success

*See VerifyandExploreData for info on outliers
/*
hist kwhw
hist kwhw if kwhw<5000
*/

*Remvoe outliers
bysort id: egen idave = mean(kwhw)
*hist idave
*hist idave if idave<5000
su idave,detail
unique id
unique id if idave>4419
//drop accounts with idave more than 5 standard deviations from the mean.
//this is 75 accounts.

drop if idave>4419
drop idave

*hist kwhw
//can still see some extreme outliers that should be removed.
//This requires removing the entire panel for that id with an outlier

*Test for single month outliers and remove id's they correspond to.
*Generate an indicator if an id contains >=1 obs with that kwhw value
capture drop temp
capture drop max_kwhw
bysort id: gen temp=1 if kwhw>5000
bysort id: egen max_kwhw=max(temp)
unique id
unique id if max_kwhw==1
//this loses a lot of id's, possibly 686 total
unique id if tps==1
unique id if tps==1 & max_kwhw==1
unique id if tps==1 & max_kwhw==.
//lose roughly 1.3% of tps accounts for a cutoff of 5000

su kwhw,detail
//drop those over 5000kwh, which is >5 sigma

*Drop accounts that are particularly large consumers of electricity
drop if max_kwhw==1
drop max_kwhw
unique id if tps==1
drop month year temp

gen lnkwhw=ln(kwhw)

bysort id: egen num_challenges=max(reduc)

cd "$GeneralModified"
save kwh_challenges,replace

end

*-------------------------------------------------------------------------------
*Generate event study indictors
capture program drop kwh_eventstudy
program define kwh_eventstudy
*-------------------------------------------------------------------------------

cd "$GeneralModified"
use kwh_challenges,clear

*Code: generates a t_* indicator for years lag or lead the first challenge.
*The b* and a* refer only to the observations before the first challenge or after the LAST challenge
*This means comparing single challenge vs multiple challenge housholds won't allow a stacking of estimated
*coefficients for a* based on years post-first-challenge (only years post-last challenge)
*The solution is to generate a different set of indicators that is years post first challenge and which skips the gaps between challenges

cd "$GeneralModified"
rename reduc challenge_order
merge m:1 id using Accounts_wide,keepusing(begin_challenge_* end_date*) keep(1 3) nogen
rename challenge_order reduc

*Procedure: 
*1 generate the t_a0 for all challenges
*2 Iterate up through challenges setting t_a1== 2nd challenge, etc
*3 once all challenge years are recorded in the indicators set the lags by individual challenge
*4 set the pre-challenge indicators
*5 replace =0 for ==.
*6 set gap indicators. Should gap indicators be separated into multiple years? YES? Is this already done?

*Define the t_b indicators for previous periods
// min date 2006m1
// max date 2015m12

// first challenge 2007m1
// last challenge 2016m8

// so will need a full panel of pre-challenge indicators, so 10
// will need at t_a0 (2007) to t_a8 to cover to 2015

tsset id date
bysort id reduc (date): gen firstchalmonth=1 if _n==1 & reduc==1

forvalues i=1/10 {
	gen byte t_b`i'=.
}
tsset
forvalues i=1/12 {
	replace t_b1=1 if F`i'.firstchalmonth==1
	local b2=`i'+12
	replace t_b2=1 if F`b2'.firstchalmonth==1
	local b3=`i'+24
	replace t_b3=1 if F`b3'.firstchalmonth==1
	local b4=`i'+36
	replace t_b4=1 if F`b4'.firstchalmonth==1
	local b5=`i'+48
	replace t_b5=1 if F`b5'.firstchalmonth==1
	local b6=`i'+60
	replace t_b6=1 if F`b6'.firstchalmonth==1
	local b7=`i'+72
	replace t_b7=1 if F`b7'.firstchalmonth==1
	local b8=`i'+84
	replace t_b8=1 if F`b8'.firstchalmonth==1
	local b9=`i'+96
	replace t_b9=1 if F`b9'.firstchalmonth==1
	local b10=`i'+108
	replace t_b10=1 if F`b10'.firstchalmonth==1
}
drop firstchalmonth

*To generate indicators for the gaps between challenges I will need a way of
* telling the code that the specific date is between challenges for that id
*easiest way is to use the start_date# and end_date# information


*drop start_date_day end_date_day start_date end_date

*Set first challenge t_a0
gen t_a0=1 if reduc==1

*id
*br if inlist(id,20021,20027,20033,20043)
*id date tps reduc begin_challenge_* end_date*
 
*for those households with additional challenges set indicators for all challenge years after the first
forvalues chalnum=1/8 {
	gen t_a`chalnum' = 1 if reduc==`chalnum'+1
	local nextchal = `chalnum'+1
	*Generate gap indicators. Defined as reduc_gap_2 is gap before 2nd challenge
	*reduc_gap* defined by begin_challenge_2 and begin_challenge_1
	bysort id: gen reduc_gap_`chalnum'=1 if date>(end_date`chalnum') & date<begin_challenge_`nextchal' & begin_challenge_`nextchal'!=.
	*& begin_challenge_`chalnum'!=. & begin_challenge_`numchal'!=.
}

*if a household does not have a challenge after the first (or second, etc) then still need to define t_a*
*Create 1-7 lags for years past the last challenge, whichever one that is. So for num_challenges==3 the first lag would be for t_a3

tsset
forvalues x = 1(1)8 {
	local lagval=`x'*12
	replace t_a`x'=L`lagval'.t_a0 if num_challenges==1
}

*For 2 challenges
forvalues x = 2(1)8 {		
		local lagval=(`x'-1)*12
		disp "`lagval'"
		replace t_a`x'=L`lagval'.t_a1 if num_challenges==2
}

*For 3 challenges
forvalues x = 3(1)8 {		
		local lagval=(`x'-2)*12
		replace t_a`x'=L`lagval'.t_a2 if num_challenges==3
}

*For 4 challenges
forvalues x = 4(1)8 {		
		local lagval=(`x'-3)*12
		replace t_a`x'=L`lagval'.t_a3 if num_challenges==4
}

*For 5 challenges
forvalues x = 5(1)8 {		
		local lagval=(`x'-4)*12
		replace t_a`x'=L`lagval'.t_a4 if num_challenges==5
}

*For 6 challenges
forvalues x = 6(1)8 {		
		local lagval=(`x'-5)*12
		replace t_a`x'=L`lagval'.t_a5 if num_challenges==6
}

*For 7 challenges
forvalues x = 7(1)8 {		
		local lagval=(`x'-6)*12
		replace t_a`x'=L`lagval'.t_a6 if num_challenges==7
}

*For 8 challenges
forvalues x = 8(1)8 {		
		local lagval=(`x'-7)*12
		replace t_a`x'=L`lagval'.t_a7 if num_challenges==8
}

*Replace all missing obs with 0's
foreach lname of varlist t_* {
	replace `lname'=0 if `lname'==.
}

foreach lname of varlist reduc_gap_* {
	replace `lname'=0 if `lname'==.
}


*Reorder variables to make them t_b* then t_a0 then t_a1 etc
order t_b8 t_b7 t_b6 t_b5 t_b4 t_b3 t_b2 t_b1,before(t_a0)

*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
*Generate the monthly indicators
*- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

disp "Monthly indicators"
sort id date

forvalues x=1/12 {
	bysort id t_a0 (date) : gen byte t_a0_fm`x'=t_a0 if _n==`x' & t_a0==1
	replace t_a0_fm`x'=0 if t_a0_fm`x'==.
}

forvalues z=1/10 {
	forvalues x=1/12 {
		local fmNum=12-`x'+1
		bysort id t_b`z' (date) : gen byte t_b`z'_fm`fmNum'=t_b`z' if _n==_N-`x'+1 & t_b`z'==1
		replace t_b`z'_fm`fmNum'=0 if t_b`z'_fm`fmNum'==.
	}
}
	
*For t_a*
forvalues z=1/8 {
	forvalues x=1/12 {
		bysort id t_a`z' (date) : gen byte t_a`z'_fm`x'=t_a`z' if _n==`x' & t_a`z'==1
		replace t_a`z'_fm`x'=0 if t_a`z'_fm`x'==.
	}
}

*-------------------------------------------------------------------------------
*Generate the t_t indicators
*-------------------------------------------------------------------------------
*ta0 and then 8 post years. So 96 indicators beyond t_a0?
*Generate 96 months post-treatment, plus indicator for after that
*start from the last month of reduc==1
br id date tps reduc if tps==1
bysort id reduc (date): gen lastmonth=1 if _n==12 & reduc==1
tsset id date
capture drop t_t*
forvalues i=1/96 {
	bysort id: gen byte t_t_m`i'=L`i'.lastmonth
	replace t_t_m`i'=0 if t_t_m`i'==.
}

cd "$GeneralModified"
save kwh_eventstudy,replace

end

*-------------------------------------------------------------------------------
* Balance households between the TPS=0 and TPS==1 accounts
capture program drop BalanceTPS01Accounts
program define BalanceTPS01Accounts
*-------------------------------------------------------------------------------
*Merge the unique_buildingchars_class data with Accounts_all so I can plot trends in use
cd "$GeneralModified"
use kwh_eventstudy,clear

cd "$datafrom"
merge m:1 id using unique_buildingchars_class,keepusing(heating sitecode buildingtype)
cd "$GeneralModified"
keep if _merge==3
drop _merge

cd "$GeneralModified"
*merge m:1 id using Accounts_wide,keepusing(begin_challenge_1 begin_challenge_2 shortgap*) keep(1 3) nogen
merge m:1 id using Accounts_wide,keepusing(shortgap*) keep(1 3) nogen

drop if tps==1 & num_challenges==0
//households with no challenges within the date period (they start too late or close to the end of 2015)
*unique id if tps==1 & num_challenges==0
*tab begin_challenge_1 if tps==1 & num_challenges==0
*These show that tps==1 but no challenge is because they start the challenges recently and so have no challenge record

*-------------------------------------------------------------------------------
*Time trends by building type - demonstrate that TPS 0 and TPS 1 are similar if composition is accounted for
*-------------------------------------------------------------------------------
*Time trend for all accounts
bysort date tps: egen tps01ave_use=mean(kwhw)
bysort date tps: gen uni_date_tps01=1 if _n==1

twoway line tps01ave_use date if uni_date_tps01==1 & tps==0 || line tps01ave_use date if uni_date_tps01==1 & tps==1, ///
	legend(label(1 "Non-Participants") label(2 "RBP Participants")) ///
	xtitle(" ") ytitle(Monthly kWh) title(All Accounts)
*BALANCE ACCOUNTS B.1
graph rename TimeTrend_AllAccounts,replace

bysort date tps buildingtype: egen tps01ave_use_bt=mean(kwhw)
bysort date tps buildingtype: gen uni_date_bt_tps01=1 if _n==1

twoway line tps01ave_use_bt date if uni_date_bt_tps01==1 & tps==0 || line tps01ave_use_bt date if uni_date_bt_tps01==1 & tps==1,by(buildingtype) ///
	legend(label(1 "Non-Participants") label(2 "RBP Participants")) ///
	xtitle(" ") ytitle(Monthly kWh) title(By Building Type)
*BALANCE ACCOUNTS B.2
graph rename TimeTrend_ByBuildingType,replace
//can see building type doesn't totally remove the difference betwen tps=0 and 1, especially for 2 Sty Sfd.

*Time trends by buildilngtype and heating
bysort date tps buildingtype heating: egen tps01ave_use_bt_ht=mean(kwhw)
bysort date tps buildingtype heating: gen uni_date_bt_ht_tps01=1 if _n==1

twoway line tps01ave_use_bt_ht date if uni_date_bt_ht_tps01==1 & tps==0 & heating==0 || line tps01ave_use_bt_ht date if uni_date_bt_ht_tps01==1 & tps==1 & heating==0,by(buildingtype) ///
	legend(label(1 "Non-Participants") label(2 "RBP Participants")) ///
	xtitle("Heating==0") ytitle(Monthly kWh)
*BALANCE ACCOUNTS B.3.1
graph rename TimeTrendByTypeHeating0,replace

twoway line tps01ave_use_bt_ht date if uni_date_bt_ht_tps01==1 & tps==0 & heating==1 || line tps01ave_use_bt_ht date if uni_date_bt_ht_tps01==1 & tps==1 & heating==1,by(buildingtype) ///
	legend(label(1 "Non-Participants") label(2 "RBP Participants")) ///
	xtitle("Heating==1") ytitle(Monthly kWh)
*BALANCE ACCOUNTS B.3.2
graph rename TimeTrendByTypeHeating1
//these show very good overlap between types; this implies that participants and non-participants might be good controls

capture drop uni_date*
capture drop tps01*

*-------------------------------------------------------------------------------
*Restrict to households starting their first and second Challenges to the $75 period
*-------------------------------------------------------------------------------
bysort id: gen uniid=1 if _n==1
// tab begin_challenge_1 if uniid==1
// tab begin_challenge_2 if uniid==1

*Restrict to households that start their first Challenge on or before 2013m2.
*This gives them 1 year of the challenge, ending 2014m2, and 6 months to decide whether to start a 2nd challenge, before 2014m8 when the program changed
*It gives me a 2007 to 2013 -> 6 years of start dates
*Define "balancedset" as balanced accounts for start date <2013m2. Why this date? 
*Define "balancedset_sg" as balanced accounts for shortgap==1 & start date <2013m2
*Define "balancedset_sg6" as balanced accounts for shortgapV2<=6 & start date <2013m2: 4810 accounts
*Define "balancedset_all" as the tps==0 for ALL tps==1 accounts, independent of start date: balancedset_all has 9817 participants
*balancedset_all & begin_challenge_1<=monthly("2014m8","YM") restricts to 8877 accounts, which is used for estimating the full DID results.
*this is the largest set of accounts I should use. 
*Define "balancedset_change" as the balanced accounts for start date <=2014m8, after which is when the challenge was updated. Weather adjustment appears to have occured in 2014m1: FRD results cannot use accounts after this month. 
*This allows me to not worry about the c1 vs c2om selection when I check both begin_challenge_1 and begin_challenge_2 separately. 

*unique id if tps==1 & begin_challenge_1<=monthly("2014m8","YM") & shortgapV2<=12: 7117 accounts

gen reward=.
replace reward=75 if begin_challenge_1<=monthly("2013m2","YM")
*replace reward=75 if begin_challenge_2<=monthly("2014m8","YM")
*replace reward=50 if begin_challenge_1>monthly("2014m8","YM")
*replace reward=50 if begin_challenge_2>monthly("2014m8","YM")

// tab begin_challenge_1 if uniid==1
// tab begin_challenge_1 if uniid==1 & reward==75
// //lose about 2000 hosueholds
// tab uniid if tps==1 & shortgap==1 & reward==75
// //lose a lot of my shortgap sample: from 6,400 to 4,000
// tab uniid if tps==1 & shortgapV2<=3 & reward==75
// tab uniid if tps==1 & shortgapV2<=6 & reward==75
//recover abotu 800 increasing the gap to 6 months
replace reward=0 if tps==0
replace reward=50 if tps==1 & reward==.

*-------------------------------------------------------------------------------
* Generate balanced set of TPS==0 accounts
*-------------------------------------------------------------------------------		
// tab buildingtype heating if uniid==1 & tps==1,miss
// tab buildingtype heating if uniid==1 & tps==0,miss
// tab buildingtype heating if uniid==1 & tps==0,miss nolabel

*Generate balancedset==1 for accounts starting before 2013mw
tab buildingtype heating if uniid==1 & tps==1 & reward==75,miss matcell(bt_ht)
*Frequencies are in matrix list bt_ht

capture drop balancedset
gen balancedset=1 if tps==1 & reward==75
local i=0
foreach bt of numlist 1 2 3 4 5 999 {
	local i=`i'+1
	disp "`i'"
	local num_ht0=bt_ht[`i',1]
	local num_ht1=bt_ht[`i',2]
	local num_htm=bt_ht[`i',3]
	disp "`num'"
	bysort uniid buildingtype heating tps reward: replace balancedset=1 if buildingtype==`bt' & _n<=`num_ht0' & uniid==1 & tps==0 & heating==0 
	bysort uniid buildingtype heating tps reward: replace balancedset=1 if buildingtype==`bt' & _n<=`num_ht1' & uniid==1 & tps==0 & heating==1
	bysort uniid buildingtype heating tps reward: replace balancedset=1 if buildingtype==`bt' & _n<=`num_htm' & uniid==1 & tps==0 & heating==.
}

*Repeat for shortgap==1 & reward==75
tab buildingtype heating if uniid==1 & tps==1 & shortgap==1 & reward==75,miss matcell(bt_ht_sg)
capture drop balancedset_sg
gen balancedset_sg=1 if tps==1 & shortgap==1 & reward==75
local i=0
foreach bt of numlist 1 2 3 4 5 999 {
	local i=`i'+1
	disp "`i'"
	local num_ht0=bt_ht_sg[`i',1]
	local num_ht1=bt_ht_sg[`i',2]
	local num_htm=bt_ht_sg[`i',3]
	disp "`num'"
	bysort uniid buildingtype heating tps: replace balancedset_sg=1 if buildingtype==`bt' & _n<=`num_ht0' & uniid==1 & tps==0 & heating==0
	bysort uniid buildingtype heating tps: replace balancedset_sg=1 if buildingtype==`bt' & _n<=`num_ht1' & uniid==1 & tps==0 & heating==1
	bysort uniid buildingtype heating tps: replace balancedset_sg=1 if buildingtype==`bt' & _n<=`num_htm' & uniid==1 & tps==0 & heating==.
}


*Repeat for shortgapV2<=6 & reward==75
tab buildingtype heating if uniid==1 & tps==1 & shortgapV2<=6 & reward==75,miss matcell(bt_ht_sg)
capture drop balancedset_sg6
gen balancedset_sg6=1 if tps==1 & shortgapV2<=6 & reward==75
local i=0
foreach bt of numlist 1 2 3 4 5 999 {
	local i=`i'+1
	disp "`i'"
	local num_ht0=bt_ht_sg[`i',1]
	local num_ht1=bt_ht_sg[`i',2]
	local num_htm=bt_ht_sg[`i',3]
	disp "`num'"
	bysort uniid buildingtype heating tps: replace balancedset_sg6=1 if buildingtype==`bt' & _n<=`num_ht0' & uniid==1 & tps==0 & heating==0
	bysort uniid buildingtype heating tps: replace balancedset_sg6=1 if buildingtype==`bt' & _n<=`num_ht1' & uniid==1 & tps==0 & heating==1
	bysort uniid buildingtype heating tps: replace balancedset_sg6=1 if buildingtype==`bt' & _n<=`num_htm' & uniid==1 & tps==0 & heating==.
}

*Repeat for shortgapV2<=12 & reward==75

*Repeat for balancedset_all, which is the balanced set for all tps==1 accounts
tab buildingtype heating if uniid==1 & tps==1,miss matcell(bt_ht_sg)
capture drop balancedset_all
gen balancedset_all=1 if tps==1
local i=0
foreach bt of numlist 1 2 3 4 5 999 {
	local i=`i'+1
	disp "`i'"
	local num_ht0=bt_ht_sg[`i',1]
	local num_ht1=bt_ht_sg[`i',2]
	local num_htm=bt_ht_sg[`i',3]
	disp "`num'"
	bysort uniid buildingtype heating tps: replace balancedset_all=1 if buildingtype==`bt' & _n<=`num_ht0' & uniid==1 & tps==0 & heating==0
	bysort uniid buildingtype heating tps: replace balancedset_all=1 if buildingtype==`bt' & _n<=`num_ht1' & uniid==1 & tps==0 & heating==1
	bysort uniid buildingtype heating tps: replace balancedset_all=1 if buildingtype==`bt' & _n<=`num_htm' & uniid==1 & tps==0 & heating==.
}

// tab buildingtype heating if uniid==1 & tps==0,miss
// tab buildingtype heating if uniid==1 & tps==0 & balancedset==1,miss
// tab buildingtype heating if uniid==1 & tps==1 & balancedset==1,miss

// tab buildingtype heating if uniid==1 & tps==0 & balancedset_sg==1,miss
// tab buildingtype heating if uniid==1 & tps==1 & balancedset_sg==1,miss

// tab buildingtype heating if uniid==1 & tps==0 & balancedset_sg6==1,miss
// tab buildingtype heating if uniid==1 & tps==1 & balancedset_sg6==1,miss

// tab buildingtype heating if uniid==1 & tps==0 & balancedset_all==1,miss
// tab buildingtype heating if uniid==1 & tps==1 & balancedset_all==1,miss

*Assign balancedset==1 to all obs within an id
capture drop temp
bysort id: egen temp=mean(balancedset)
replace balancedset=temp
drop temp

capture drop temp
bysort id: egen temp=mean(balancedset_sg)
replace balancedset_sg=temp
drop temp

capture drop temp
bysort id: egen temp=mean(balancedset_sg6)
replace balancedset_sg6=temp
drop temp

capture drop temp
bysort id: egen temp=mean(balancedset_all)
replace balancedset_all=temp
drop temp

replace balancedset=0 if balancedset==.
replace balancedset_sg=0 if balancedset_sg==.
replace balancedset_sg6=0 if balancedset_sg6==.
replace balancedset_all=0 if balancedset_all==.

// tab buildingtype heating if tps==0 & balancedset==1,miss
// tab buildingtype heating if tps==1 & balancedset==1,miss
// tab buildingtype heating if tps==0 & balancedset_sg==1,miss
// tab buildingtype heating if tps==1 & balancedset_sg==1,miss
// tab buildingtype heating if tps==0 & balancedset_all==1,miss
// tab buildingtype heating if tps==1 & balancedset_all==1,miss

*-------------------------------------------------------------------------------
* Relabel missing heating indicator as 2.
*-------------------------------------------------------------------------------
replace heating=2 if heating==.

*-------------------------------------------------------------------------------
* Generate the balancedset_change as a subset of balancedset_all - equal 8877 accounts, with begin_challenge_1<=monthly("2014m8","YM")
*-------------------------------------------------------------------------------

capture drop balancedset_change
gen balancedset_change=1 if tps==1 & begin_challenge_1<=monthly("2014m8","YM")
tab buildingtype heating if uniid==1 & balancedset_change==1,miss matcell(bt_ht)
local i=0
	foreach bt of numlist 1 2 3 4 5 999 {
	local i=`i'+1
	disp "`i'"
	local num_ht0=bt_ht[`i',1]
	local num_ht1=bt_ht[`i',2]
	local num_htm=bt_ht[`i',3]
	disp "`num'"
	bysort uniid buildingtype heating tps: replace balancedset_change=1 if buildingtype==`bt' & _n<=`num_ht0' & uniid==1 & tps==0 & heating==0
	bysort uniid buildingtype heating tps: replace balancedset_change=1 if buildingtype==`bt' & _n<=`num_ht1' & uniid==1 & tps==0 & heating==1
	bysort uniid buildingtype heating tps: replace balancedset_change=1 if buildingtype==`bt' & _n<=`num_htm' & uniid==1 & tps==0 & heating==2
}

tab tps if balancedset_change==1 & uniid==1

capture drop temp
bysort id: egen temp=mean(balancedset_change)
replace balancedset_change=temp
drop temp

replace balancedset_change=0 if balancedset_change==.
tab tps if balancedset_change==1

*-------------------------------------------------------------------------------
* Clean data set for easier use
*-------------------------------------------------------------------------------

*Recast data
recast byte reduc_gap* shortgap balancedset balancedset_sg balancedset_sg6 balancedset_all
capture drop uni_date*
capture drop tps01*

*-------------------------------------------------------------------------------
* Save the balanced dataset
*-------------------------------------------------------------------------------	
cd "$GeneralModified"
*use "BalancedData",clear
save "BalancedData",replace

end

*========================================================================
*Call the main program to run this file.
*========================================================================
main

